#######################################
##########  Chase Joyner     ##########
##########  885 Homework 2   ##########
##########  October 1, 2015  ##########
#######################################

################################
########## Problem 1a ##########
################################
n = c(10, 50, 100);
iter = 1000;
p = .1
c = 3;
alpha = .05;
results = matrix(NA, nrow = length(n), ncol = 4);
colnames(results) = c("Estimate", "ASE", "LB", "UB");
res = matrix(NA, nrow = iter, ncol = 4);

prob1a = function(Z, c, alpha){

	## Preliminaries ##
	n = length(Z);	

	## MLE of Pstar ##
	pstarhat = mean(Z);

	## By invariance property of MLEs ##
	phat = 1 - (1 - pstarhat)^(1 / c);

	## Asymptotic std error ##
	p.se = (1 / c) * sqrt((pstarhat * (1 - pstarhat)^((2 - c) / c)) / n);

	## CI for p ##
	UB = phat + qnorm(1 - alpha / 2) * p.se;
	LB = phat - qnorm(1 - alpha / 2) * p.se;

	return(c(phat, p.se, LB, UB));
}
	
################################
########## Problem 1b ##########
################################
J = c(25, 50, 100, 200);
c = c(4, 6, 8, 10);
p = seq(.001, .2, .001);
alpha = 0.05;

## Number of data sets to be generated ##
n = 1000;

## Hold results of 1000 data sets for (J,c,p) ##
res = matrix(NA, nrow = n, ncol = 4);

## Create three arrays, one for each figure ##
## Each array has length(J) elements, consisting of ##
## matrix with length(p) rows, length(c) cols ##
## where each component is desired result to ##
## be plotted in figures ##
arr1 = array(NA, c(length(p), length(c), length(J)));
arr2 = array(NA, c(length(p), length(c), length(J)));
arr3 = array(NA, c(length(p), length(c), length(J)));


for(i in 1:length(J)){

	for(j in 1:length(c)){

		for(k in 1:length(p)){

			## For coverage probabilites ##
			count = 0;

			for(l in 1:n){
				
				data = matrix(rbinom(J[i] * c[j], 1, p[k]), nrow = J[i], ncol = c[j]);
				Z = apply(data, 1, max);
				res[l,] = prob1a(Z, c[j], alpha);
				count = count + (p[k] >= res[l,3] && p[k] <= res[l,4]); 
			}
			
			empiricals = apply(res, 2, mean);
			arr1[k,j,i] = empiricals[1] - p[k];
			arr2[k,j,i] = sd(res[,1]) - empiricals[2];
			arr3[k,j,i] = count / n;

			print(c(i,j,k));
		}

	}

}


######## Figure 1 ########
par(mfrow = c(2,2));
plot(p, as.vector(arr1[,1,1]), ylab = "bias", ylim = c(-.05, .05), main = "J = 25", type = "l", col = "red");
lines(p, as.vector(arr1[,2,1]), col = "green");
lines(p, as.vector(arr1[,3,1]), col = "blue");
lines(p, as.vector(arr1[,4,1]), col = "black");

plot(p, as.vector(arr1[,1,2]), ylab = "bias", ylim = c(-.05, .05), main = "J = 50", type = "l", col = "red");
lines(p, as.vector(arr1[,2,2]), col = "green");
lines(p, as.vector(arr1[,3,2]), col = "blue");
lines(p, as.vector(arr1[,4,2]), col = "black");

plot(p, as.vector(arr1[,1,3]), ylab = "bias", ylim = c(-.05, .05), main = "J = 100", type = "l", col = "red");
lines(p, as.vector(arr1[,2,3]), col = "green");
lines(p, as.vector(arr1[,3,3]), col = "blue");
lines(p, as.vector(arr1[,4,3]), col = "black");

plot(p, as.vector(arr1[,1,4]), ylab = "bias", ylim = c(-.05, .05), main = "J = 200", type = "l", col = "red");
lines(p, as.vector(arr1[,2,4]), col = "green");
lines(p, as.vector(arr1[,3,4]), col = "blue");
lines(p, as.vector(arr1[,4,4]), col = "black");


######## Figure 2 ########
par(mfrow = c(2,2));
plot(p, as.vector(arr2[,1,1]), ylab = "Std. Error Differences", ylim = c(-.005,.005), main = "J = 25", type = "l", col = "red");
lines(p, as.vector(arr2[,2,1]), col = "green");
lines(p, as.vector(arr2[,3,1]), col = "blue");
lines(p, as.vector(arr2[,4,1]), col = "black");

plot(p, as.vector(arr2[,1,2]), ylab = "Std. Error Differences", ylim = c(-.005,.005), main = "J = 50", type = "l", col = "red");
lines(p, as.vector(arr2[,2,2]), col = "green");
lines(p, as.vector(arr2[,3,2]), col = "blue");
lines(p, as.vector(arr2[,4,2]), col = "black");

plot(p, as.vector(arr2[,1,3]), ylab = "Std. Error Differences", ylim = c(-.005,.005), main = "J = 100", type = "l", col = "red");
lines(p, as.vector(arr2[,2,3]), col = "green");
lines(p, as.vector(arr2[,3,3]), col = "blue");
lines(p, as.vector(arr2[,4,3]), col = "black");

plot(p, as.vector(arr2[,1,4]), ylab = "Std. Error Differences", ylim = c(-.005,.005), main = "J = 200", type = "l", col = "red");
lines(p, as.vector(arr2[,2,4]), col = "green");
lines(p, as.vector(arr2[,3,4]), col = "blue");
lines(p, as.vector(arr2[,4,4]), col = "black");


######## Figure 3 ########
par(mfrow = c(2,2));
plot(p, as.vector(arr3[,1,1]), ylab = "Coverage", ylim = c(0,1), main = "J = 25", type = "l", col = "red");
lines(p, as.vector(arr3[,2,1]), col = "green");
lines(p, as.vector(arr3[,3,1]), col = "blue");
lines(p, as.vector(arr3[,4,1]), col = "black");
abline(1-alpha,0);

plot(p, as.vector(arr3[,1,2]), ylab = "Coverage", ylim = c(0,1), main = "J = 50", type = "l", col = "red");
lines(p, as.vector(arr3[,2,2]), col = "green");
lines(p, as.vector(arr3[,3,2]), col = "blue");
lines(p, as.vector(arr3[,4,2]), col = "black");
abline(1-alpha,0);

plot(p, as.vector(arr3[,1,3]), ylab = "Coverage", ylim = c(0,1), main = "J = 100", type = "l", col = "red");
lines(p, as.vector(arr3[,2,3]), col = "green");
lines(p, as.vector(arr3[,3,3]), col = "blue");
lines(p, as.vector(arr3[,4,3]), col = "black");
abline(1-alpha,0);

plot(p, as.vector(arr3[,1,4]), ylab = "Coverage", ylim = c(0,1), main = "J = 200", type = "l", col = "red");
lines(p, as.vector(arr3[,2,4]), col = "green");
lines(p, as.vector(arr3[,3,4]), col = "blue");
lines(p, as.vector(arr3[,4,4]), col = "black");
abline(1-alpha,0);


################################
########## Problem 1c ##########
################################
prob1c = function(Z, cj){

	## Create log-likelihood function as a function ##
	llik = function(p, Z, Cj){
                pz = Se + (1 - Se - Sp) * (1 - p)^Cj;
		res = -sum(Z * log(pz) + (1 - Z) * log(1-pz));
		return(res);
	}
	
	## Optimize the llik function ##
	p.hat = optimize(llik, c(.001,1), Cj = cj, Z = Z)$minimum;

	## Asymptotic variance ##
	pjss = Se + (1 - p.hat)^cj * (1 - Se - Sp);
	dp = -cj * (1 - Se - Sp) * (1 - p.hat)^(cj - 1);
	dp2 = cj * (cj - 1) * (1 - Se - Sp) * (1 - p.hat)^(cj - 2);
	var = -sum(Z * (-1 / (pjss^2) * dp^2 + 1 / pjss * dp2) - (1 - Z) * (1 / (1 - pjss)^2 * dp^2 + 1 / (1 - pjss) * dp2));

	## Confidence Interval ##
	LB = p.hat - qnorm(1 - alpha / 2) * sqrt(var^(-1));
	UB = p.hat + qnorm(1 - alpha / 2) * sqrt(var^(-1));	

	## Calculate second way ##
	# Se = sqrt((1 / length(Z)) * (hessian(llik, p.hat, Cj = cj, Z = Z))^(-1));

	return(c(p.hat, sqrt(1 / var), LB, UB));
}


################################
########## Problem 1d ##########
################################
J = c(25, 50, 100, 200);
lambda = c(2, 4, 6, 8);
iter = 100;
alpha = 0.05;
p = seq(.001, .2, .001);
Se = 0.95;
Sp = 0.98;
res = matrix(NA, nrow = iter, ncol = 4);

## Create three arrays, one for each figure ##
## Each array has length(J) elements, consisting of ##
## matrix with length(p) rows, length(lambda) cols ##
## where each component is desired result to ##
## be plotted in figures ##
arr1 = array(NA, c(length(p), length(lambda), length(J)));
arr2 = array(NA, c(length(p), length(lambda), length(J)));
arr3 = array(NA, c(length(p), length(lambda), length(J)));

for(j in 1:length(J)){
	
	for(k in 1:length(lambda)){

		for(l in 1:length(p)){

			## For coverage probabilities ##
			count = 0;

			for(i in 1:iter){
				cj = 2 + rpois(J[j], lambda[k]);
				pstarstar = Se + (1 - Se - Sp) * (1 - p[l])^cj;
				Z = rbinom(J[j], 1, pstarstar);

				res[i,] = prob1c(Z, cj);

				count = count + (p[l] >= res[i,3] && p[l] <= res[i,4]); 
			}

			empiricals = apply(res, 2, mean);
			arr1[l,k,j] = empiricals[1] - p[l];
			arr2[l,k,j] = sd(res[,1]) - empiricals[2];
			arr3[l,k,j] = count / iter;
			print(c(j,k,l));
		}
	}
}

######## Figure 1 ########
par(mfrow = c(2,2));
plot(p, as.vector(arr1[,1,1]), ylab = "bias", ylim = c(-.05, .05), main = "J = 25", type = "l", col = "red");
lines(p, as.vector(arr1[,2,1]), col = "green");
lines(p, as.vector(arr1[,3,1]), col = "blue");
lines(p, as.vector(arr1[,4,1]), col = "black");

plot(p, as.vector(arr1[,1,2]), ylab = "bias", ylim = c(-.05, .05), main = "J = 50", type = "l", col = "red");
lines(p, as.vector(arr1[,2,2]), col = "green");
lines(p, as.vector(arr1[,3,2]), col = "blue");
lines(p, as.vector(arr1[,4,2]), col = "black");

plot(p, as.vector(arr1[,1,3]), ylab = "bias", ylim = c(-.05, .05), main = "J = 100", type = "l", col = "red");
lines(p, as.vector(arr1[,2,3]), col = "green");
lines(p, as.vector(arr1[,3,3]), col = "blue");
lines(p, as.vector(arr1[,4,3]), col = "black");

plot(p, as.vector(arr1[,1,4]), ylab = "bias", ylim = c(-.05, .05), main = "J = 200", type = "l", col = "red");
lines(p, as.vector(arr1[,2,4]), col = "green");
lines(p, as.vector(arr1[,3,4]), col = "blue");
lines(p, as.vector(arr1[,4,4]), col = "black");


######## Figure 2 ########
par(mfrow = c(2,2));
plot(p, as.vector(arr2[,1,1]), ylab = "Std. Error Differences", ylim = c(-.005, .005), main = "J = 25", type = "l", col = "red");
lines(p, as.vector(arr2[,2,1]), col = "green");
lines(p, as.vector(arr2[,3,1]), col = "blue");
lines(p, as.vector(arr2[,4,1]), col = "black");

plot(p, as.vector(arr2[,1,2]), ylab = "Std. Error Differences", ylim = c(-.005, .005), main = "J = 50", type = "l", col = "red");
lines(p, as.vector(arr2[,2,2]), col = "green");
lines(p, as.vector(arr2[,3,2]), col = "blue");
lines(p, as.vector(arr2[,4,2]), col = "black");

plot(p, as.vector(arr2[,1,3]), ylab = "Std. Error Differences", ylim = c(-.005, .005), main = "J = 100", type = "l", col = "red");
lines(p, as.vector(arr2[,2,3]), col = "green");
lines(p, as.vector(arr2[,3,3]), col = "blue");
lines(p, as.vector(arr2[,4,3]), col = "black");

plot(p, as.vector(arr2[,1,4]), ylab = "Std. Error Differences", ylim = c(-.005, .005), main = "J = 200", type = "l", col = "red");
lines(p, as.vector(arr2[,2,4]), col = "green");
lines(p, as.vector(arr2[,3,4]), col = "blue");
lines(p, as.vector(arr2[,4,4]), col = "black");


######## Figure 3 ########
par(mfrow = c(2,2));
plot(p, as.vector(arr3[,1,1]), ylab = "Coverage", ylim = c(0,1), main = "J = 25", type = "l", col = "red");
lines(p, as.vector(arr3[,2,1]), col = "green");
lines(p, as.vector(arr3[,3,1]), col = "blue");
lines(p, as.vector(arr3[,4,1]), col = "black");
abline(1-alpha,0);

plot(p, as.vector(arr3[,1,2]), ylab = "Coverage", ylim = c(0,1), main = "J = 50", type = "l", col = "red");
lines(p, as.vector(arr3[,2,2]), col = "green");
lines(p, as.vector(arr3[,3,2]), col = "blue");
lines(p, as.vector(arr3[,4,2]), col = "black");
abline(1-alpha,0);

plot(p, as.vector(arr3[,1,3]), ylab = "Coverage", ylim = c(0,1), main = "J = 100", type = "l", col = "red");
lines(p, as.vector(arr3[,2,3]), col = "green");
lines(p, as.vector(arr3[,3,3]), col = "blue");
lines(p, as.vector(arr3[,4,3]), col = "black");
abline(1-alpha,0);

plot(p, as.vector(arr3[,1,4]), ylab = "Coverage", ylim = c(0,1), main = "J = 200", type = "l", col = "red");
lines(p, as.vector(arr3[,2,4]), col = "green");
lines(p, as.vector(arr3[,3,4]), col = "blue");
lines(p, as.vector(arr3[,4,4]), col = "black");
abline(1-alpha,0);


########################################################################################
####################                   Problem 2a                   #################### 
########################################################################################
prob2a = function(Z, X, c, se, sp, alpha){

	llik = function(B, Z, X, Se, Sp, c){

		## To create pjstar ##
		pindex = NULL;
		#for(i in 1:length(c)){
#
#			for(j in 1:c[i]){
#
#				pindex = append(pindex, i);
#			}
#		}
		
		pindex = c;
		
		pij = exp(X %*% B) / (1 + exp(X %*% B));
		pjs = 1 - by(1-pij, pindex, prod);

		pjss = Se * pjs + (1 - Sp) * (1 - pjs);

		res = -sum(Z * log(pjss) + (1 - Z) * log(1 - pjss));

		return(res);
	}

	res = optim(rep(0, ncol(X)), llik, method = "Nelder-Mead", Z = Z, X = X, Se = se, Sp = sp, c = c, hessian = TRUE);

	betahat = res$par;

	## Asymptotic Information Matrix  and ASE ##
	I.mat = solve(res$hessian);
	SE = sqrt(diag(I.mat));

	## Confidence Interval ##
	LB = betahat - qnorm(1 - alpha / 2) * SE;
	UB = betahat + qnorm(1 - alpha / 2) * SE;

	## Marginal Wald Tests and LRTs ##
	## H0: Beta_k = 0 vs H1: Beta_k != 0 ##
	W = rejectW = pval = rejP = pLR = rejPLR = LR = rejL = rep(NA, ncol(X));

	for(i in 1:ncol(X)){
		R1 = t(rep(0, ncol(X)));
		R1[i] = 1;
		W[i] = t(R1 %*% betahat) %*% solve(R1 %*% I.mat %*% t(R1)) %*% (R1 %*% betahat);
	
		## Calculate rejection region ##
		rejectW[i] = W[i] > qchisq(0.975, 1) || W[i] < qchisq(0.025, 1);

		## Pvalue ##
		pval[i] = 2 * (1 - pnorm(abs(W[i])));
		rejP[i] = pval[i] < alpha;

		## LRT ##
		Xred = X[,-i];
		res.red = optim(rep(0, ncol(Xred)), llik, method = "Nelder-Mead", Z = Z, X = Xred, Se = se, Sp = sp, c = c);
		LR[i] = -2 * res.red$value / res$value;
		rejL[i] = LR[i] > qchisq(0.975, 1) || LR[i] < qchisq(0.025, 1);
		pLR[i] = pchisq(LR[i], 1);
		rejPLR[i] = pLR[i] < alpha;
			
	}

	## H0: Beta2 = ... = Betap = 0 vs H1: at least one not 0 -- Wald Test and LRT ##
	R2 = diag(ncol(X));
	R2 = R2[-1,];
	r = rep(0, ncol(X));
	
	W2 = t(R2 %*% betahat) %*% solve(R2 %*% I.mat %*% t(R2)) %*% (R2 %*% betahat);
	Full.test = W > qchisq(0.975, 1) || W < qchisq(0.025, 1);
	
	return(list(MLE = betahat, Var = SE, LB = LB, UB = UB, Imat = I.mat, Wald = W, RejectW = rejectW, P = pval, RejectP = rejP, LRT = LR, P.LR = pLR, RejectPLR = rejPLR, RejectL = rejL, Wald.Full = W2, Full = Full.test));
}


J = 25
lambda = 4
## Number of data sets to be generated ##
iter = 10;
## True beta ##
Beta = c(-1, 0.75, 1.3);
alpha = .05;
Se = .95;
Sp = .98;

## Matrix to save results of each (lambda, J) ##
res = list();

cj = 2 + rpois(J, lambda);
X = matrix(NA, nrow = sum(cj), ncol = length(Beta));
X[,1] = 1;
X[,2] = rnorm(sum(cj));
X[,3] = rnorm(sum(cj));
pij = exp(X %*% Beta) / (1 + exp(X %*% Beta));

pindex = NULL;
for(i in 1:length(cj)){

	for(h in 1:cj[i]){
		pindex = append(pindex, i);
	}
}

for(k in 1:iter){

	data = rbinom(sum(cj), 1, pij);
	Zj = by(data, pindex, max);
	Zj = as.vector(Zj);
	Z = rbinom(length(Zj), 1, Se * Zj + (1 - Sp) * (1 - Zj));
	res[[k]] = prob2a(Z, X, cj, Se, Sp, alpha);
			
	print(c(k));	
}


##### Create tables for parts (i), (ii), (iii) #####
## Table 1 will have mean MLE, mean ASE, mean LB, mean UB of iterations ##
## Table 2 will have mean marginal Wald stat, mean p value ##
Table1 = matrix(NA, nrow = 4, ncol = length(Beta));
Table2 = matrix(NA, nrow = 2, ncol = length(Beta));
Table3 = matrix(NA, nrow = 2, ncol = 1);

tmp1 = tmp2 = tmp3 = tmp4 = 0;
for(i in 1:iter){
	tmp1 = tmp1 + res[[i]]$MLE;
	tmp2 = tmp2 + res[[i]]$Var;
	tmp3 = tmp3 + res[[i]]$LB;
	tmp4 = tmp4 + res[[i]]$UB;
}
Table1[1,] = tmp1 / iter;
Table1[2,] = tmp2 / iter;
Table1[3,] = tmp3 / iter;
Table1[4,] = tmp4 / iter;

tmp1 = tmp2 = tmp3 = tmp4 = 0;
for(i in 1:iter){
	tmp1 = tmp1 + res[[i]]$Wald;
	tmp2 = tmp2 + res[[i]]$P
	
}
Table2[1,] = tmp1 / iter;
Table2[2,] = tmp2 / iter;

tmp1 = tmp2 = 0;
for(i in 1:iter){
	tmp1 = tmp1 + res[[i]]$Wald.Full;
	tmp2 = tmp2 + res[[i]]$Full;
}
Table3[1,] = tmp1 / iter;
Table3[2,] = tmp2 / iter;

## Create mean I(beta) for part b ##
IB = 0;
for(i in 1:iter){
	IB = IB + res[[i]]$Imat;
}
IB = IB / iter;


########################################################################################
####################       Problem 2b -- Some Problems Here         #################### 
########################################################################################
alpha = 0.05;

MLE = Table1[1,];
Var = Table1[2,];

prob2b = function(X, alpha, MLE, Var, IB){

	pijCI = matrix(NA, ncol = 2, nrow = sum(cj));
	pij.hat = exp(X %*% MLE) / (1 + exp(X %*% MLE));

	for(i in 1:length(pijCI)){
	
		pijCI[i,1] = pij.hat[i] - qnorm(alpha / 2) * 


	}
	
	LB = pij.hat - qnorm(alpha / 2) * (((exp(X %*% MLE) * X) / (1 + exp(X %*% MLE))^2)^2 %*% solve(IB));

}
	

########################################################################################
####################                     Problem 2c                 #################### 
########################################################################################
J = c(25, 50, 100, 200);
lambda = c(2, 4, 6, 8);
Beta = c(-1, 0.75, 1.3);
iter = 10;
alpha = 0.05;
Se = 0.95;
Sp = 0.98;
res = list();

## Create three arrays, one for each figure ##
## Each array has length(J) elements, consisting of ##
## matrix with length(p) rows, length(lambda) cols ##
## where each component is desired result to ##
## be plotted in figures ##
#arr1 = array(NA, c(length(p), length(lambda), length(J)));
#arr2 = array(NA, c(length(p), length(lambda), length(J)));
#arr3 = array(NA, c(length(p), length(lambda), length(J)));

for(j in 1:length(J)){

	for(l in 1:length(lambda)){

		cj = 2 + rpois(J[j], lambda[l]);
		X = matrix(NA, nrow = sum(cj), ncol = length(Beta));
		X[,1] = 1;
		X[,(2:length(Beta))] = rnorm((length(Beta) - 1) * sum(cj));
	
		pij = exp(X %*% Beta) / (1 + exp(X %*% Beta));

		pindex = NULL;
		for(k in 1:length(cj)){

			for(h in 1:cj[k]){
				pindex = append(pindex, k);
			}
		}

		for(i in 1:iter){

			data = rbinom(sum(cj), 1, pij);
			Zj = by(data, pindex, max);
			Zj = as.vector(Zj);
			Z = rbinom(length(Zj), 1, Se * Zj + (1 - Sp) * (1 - Zj));
			res[[i]] = prob2a(Z, X, cj, Se, Sp, alpha);


		}
	
		print(c(j,l));

	}


}


########################################################################################
####################                     Problem 2d                 #################### 
########################################################################################

data = read.table(file.choose());
data = data[,-1];
data = data[,-4];
names(data) = c("Response", "Pool Size", "Pool ID", "Age", "Gender", "Hours");
cj = data[,3];  ## this is really the pindex ##
Z = as.vector(by(data[,1], cj, max));
X = as.matrix(data[,(4:6)]);
X = cbind(rep(1, length(X[,2])), X);
Se = 0.95;
Sp = 0.95;
alpha = 0.05;

res = prob2a(Z, X, cj, Se, Sp, alpha);